Visualization of business product information and text processing results.¶

import packages

In [ ]:
#command:jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
import nltk
import string
import re
import numpy as np
import pandas as pd
import pickle
#import lda

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white")

from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
#from bokeh.transform import factor_cmap

import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger("lda").setLevel(logging.WARNING)
In [ ]:
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')
In [ ]:
# size of training and dataset
print(train.shape)
print(test.shape)
(1482535, 8)
(693359, 7)
In [ ]:
# different data types in the dataset: categorical (strings) and numeric
train.dtypes
Out[ ]:
train_id               int64
name                  object
item_condition_id      int64
category_name         object
brand_name            object
price                float64
shipping               int64
item_description      object
dtype: object
In [ ]:
train.head()
Out[ ]:
train_id name item_condition_id category_name brand_name price shipping item_description
0 0 MLB Cincinnati Reds T Shirt Size XL 3 Men/Tops/T-shirts NaN 10.0 1 No description yet
1 1 Razer BlackWidow Chroma Keyboard 3 Electronics/Computers & Tablets/Components & P... Razer 52.0 0 This keyboard is in great condition and works ...
2 2 AVA-VIV Blouse 1 Women/Tops & Blouses/Blouse Target 10.0 1 Adorable top with a hint of lace and a key hol...
3 3 Leather Horse Statues 1 Home/Home Décor/Home Décor Accents NaN 35.0 1 New with tags. Leather horses. Retail for [rm]...
4 4 24K GOLD plated rose 1 Women/Jewelry/Necklaces NaN 44.0 0 Complete with certificate of authenticity

Process the suggested price we will provide using log transformation.

In [ ]:
train.price.describe()
Out[ ]:
count    1.482535e+06
mean     2.673752e+01
std      3.858607e+01
min      0.000000e+00
25%      1.000000e+01
50%      1.700000e+01
75%      2.900000e+01
max      2.009000e+03
Name: price, dtype: float64

Comparison of the distribution of price attributes before and after transformation.

In [ ]:
plt.subplot(1, 2, 1)
(train['price']).plot.hist(bins=50, figsize=(20,10), edgecolor='white',range=[0,250])
plt.xlabel('price+', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Price Distribution - Training Set', fontsize=17)

plt.subplot(1, 2, 2)
np.log(train['price']+1).plot.hist(bins=50, figsize=(20,10), edgecolor='white')
plt.xlabel('log(price+1)', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Log(Price) Distribution - Training Set', fontsize=17)
plt.show()

Shipping cost: Approximately 55% of sellers bear the shipping cost.

In [ ]:
train.shipping.value_counts()/len(train)
Out[ ]:
0    0.552726
1    0.447274
Name: shipping, dtype: float64

Look at the price changes under different shipping situations.

In [ ]:
prc_shipBySeller = train.loc[train.shipping==1, 'price']
prc_shipByBuyer = train.loc[train.shipping==0, 'price']
In [ ]:
fig, ax = plt.subplots(figsize=(20,10))
ax.hist(np.log(prc_shipBySeller+1), color='#8CB4E1', alpha=1.0, bins=50,
       label='Price when Seller pays Shipping')
ax.hist(np.log(prc_shipByBuyer+1), color='#007D00', alpha=0.7, bins=50,
       label='Price when Buyer pays Shipping')
ax.set(title='Histogram Comparison', ylabel='% of Dataset in Bin')
plt.legend()
plt.xlabel('log(price+1)', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.title('Price Distribution by Shipping Type', fontsize=17)
plt.tick_params(labelsize=15)
plt.show()

It seems that the average price for users who pay for shipping themselves is lower than for sellers who offer free shipping...

Product category

In [ ]:
print("There are %d unique values in the category column." % train['category_name'].nunique())
There are 1287 unique values in the category column.
In [ ]:
# TOP 5 RAW CATEGORIES
train['category_name'].value_counts()[:5]
Out[ ]:
Women/Athletic Apparel/Pants, Tights, Leggings    60177
Women/Tops & Blouses/T-Shirts                     46380
Beauty/Makeup/Face                                34335
Beauty/Makeup/Lips                                29910
Electronics/Video Games & Consoles/Games          26557
Name: category_name, dtype: int64
In [ ]:
# missing categories
print("There are %d items that do not have a label." % train['category_name'].isnull().sum())
There are 6327 items that do not have a label.

Subdivide the category

In [ ]:
def split_cat(text):
    try: return text.split("/")
    except: return ("No Label", "No Label", "No Label")
    
train['general_cat'], train['subcat_1'], train['subcat_2'] = \
zip(*train['category_name'].apply(lambda x: split_cat(x)))
train.head()
Out[ ]:
train_id name item_condition_id category_name brand_name price shipping item_description general_cat subcat_1 subcat_2
0 0 MLB Cincinnati Reds T Shirt Size XL 3 Men/Tops/T-shirts NaN 10.0 1 No description yet Men Tops T-shirts
1 1 Razer BlackWidow Chroma Keyboard 3 Electronics/Computers & Tablets/Components & P... Razer 52.0 0 This keyboard is in great condition and works ... Electronics Computers & Tablets Components & Parts
2 2 AVA-VIV Blouse 1 Women/Tops & Blouses/Blouse Target 10.0 1 Adorable top with a hint of lace and a key hol... Women Tops & Blouses Blouse
3 3 Leather Horse Statues 1 Home/Home Décor/Home Décor Accents NaN 35.0 1 New with tags. Leather horses. Retail for [rm]... Home Home Décor Home Décor Accents
4 4 24K GOLD plated rose 1 Women/Jewelry/Necklaces NaN 44.0 0 Complete with certificate of authenticity Women Jewelry Necklaces
In [ ]:
# repeat the same step for the test set
test['general_cat'], test['subcat_1'], test['subcat_2'] = \
zip(*test['category_name'].apply(lambda x: split_cat(x)))
In [ ]:
print("There are %d unique first sub-categories." % train['subcat_1'].nunique())
There are 114 unique first sub-categories.
In [ ]:
print("There are %d unique second sub-categories." % train['subcat_2'].nunique())
There are 871 unique second sub-categories.

Overall, we have 7 main categories (114 in the first subcategory and 871 in the second subcategory): Women and beauty items are the most popular categories (over 50% of observations), followed by children and electronic products.

Distribution of major categories:

In [ ]:
x = train['general_cat'].value_counts().index.values.astype('str')
y = train['general_cat'].value_counts().values
pct = [("%.2f"%(v*100))+"%"for v in (y/len(train))]
In [ ]:
#import plotly.offline as py
#py.init_notebook_mode(connected=True)
#import plotly.graph_objs as go
#import plotly.tools as tls

trace1 = go.Bar(x=x, y=y, text=pct)
layout = dict(title= 'Number of Items by Main Category',
              yaxis = dict(title='Count'),
              xaxis = dict(title='Category'))
fig=dict(data=[trace1], layout=layout)
py.iplot(fig)

Distribution of subcat_1

In [ ]:
x = train['subcat_1'].value_counts().index.values.astype('str')[:15]
y = train['subcat_1'].value_counts().values[:15]
pct = [("%.2f"%(v*100))+"%"for v in (y/len(train))][:15]
In [ ]:
trace1 = go.Bar(x=x, y=y, text=pct,
                marker=dict(
                color = y,colorscale='Portland',showscale=True,
                reversescale = False
                ))
layout = dict(title= 'Number of Items by Sub Category (Top 15)',
              yaxis = dict(title='Count'),
              xaxis = dict(title='SubCategory'))
fig=dict(data=[trace1], layout=layout)
py.iplot(fig)
In [ ]:
general_cats = train['general_cat'].unique()
x = [train.loc[train['general_cat']==cat, 'price'] for cat in general_cats]
In [ ]:
data = [go.Box(x=np.log(x[i]+1), name=general_cats[i]) for i in range(len(general_cats))]
In [ ]:
layout = dict(title="Price Distribution by General Category",
              yaxis = dict(title='Frequency'),
              xaxis = dict(title='Category'))
fig = dict(data=data, layout=layout)
py.iplot(fig)

Brand name

In [ ]:
print("There are %d unique brand names in the training dataset." % train['brand_name'].nunique())
There are 4809 unique brand names in the training dataset.
In [ ]:
x = train['brand_name'].value_counts().index.values.astype('str')[:10]
y = train['brand_name'].value_counts().values[:10]
In [ ]:
trace1 = go.Bar(x=x, y=y, 
                 marker=dict(
                 color = y,colorscale='Portland',showscale=True,
                 reversescale = False
                 ))
layout = dict(title= 'Top 10 Brand by Number of Items',
               yaxis = dict(title='Brand Name'),
              xaxis = dict(title='Count'))
fig=dict(data=[trace1], layout=layout)
py.iplot(fig)

Product description

Because it is unstructured data, parsing this specific item will be more challenging. Does this mean that more detailed and longer descriptions lead to higher bids? We will remove all punctuation, remove some English stop words (i.e. redundant words such as "a," "the," etc.) as well as any other words of length less than 3.

In [ ]:
def wordCount(text):
    # convert to lower case and strip regex
    try:
         # convert to lower case and strip regex
        text = text.lower()
        regex = re.compile('[' +re.escape(string.punctuation) + '0-9\\r\\t\\n]')
        txt = regex.sub(" ", text)
        # tokenize
        # words = nltk.word_tokenize(clean_txt)
        # remove words in stop words
        words = [w for w in txt.split(" ") \
                 if not w in stop_words.ENGLISH_STOP_WORDS and len(w)>3]
        return len(words)
    except: 
        return 0
In [ ]:
# add a column of word counts to both the training and test set
train['desc_len'] = train['item_description'].apply(lambda x: wordCount(x))
test['desc_len'] = test['item_description'].apply(lambda x: wordCount(x))
In [ ]:
train.head()
Out[ ]:
train_id name item_condition_id category_name brand_name price shipping item_description general_cat subcat_1 subcat_2 desc_len
0 0 MLB Cincinnati Reds T Shirt Size XL 3 Men/Tops/T-shirts NaN 10.0 1 No description yet Men Tops T-shirts 1
1 1 Razer BlackWidow Chroma Keyboard 3 Electronics/Computers & Tablets/Components & P... Razer 52.0 0 This keyboard is in great condition and works ... Electronics Computers & Tablets Components & Parts 14
2 2 AVA-VIV Blouse 1 Women/Tops & Blouses/Blouse Target 10.0 1 Adorable top with a hint of lace and a key hol... Women Tops & Blouses Blouse 8
3 3 Leather Horse Statues 1 Home/Home Décor/Home Décor Accents NaN 35.0 1 New with tags. Leather horses. Retail for [rm]... Home Home Décor Home Décor Accents 14
4 4 24K GOLD plated rose 1 Women/Jewelry/Necklaces NaN 44.0 0 Complete with certificate of authenticity Women Jewelry Necklaces 3
In [ ]:
df = train.groupby('desc_len')['price'].mean().reset_index()

Is the price associated with length of name?

In [ ]:
trace1 = go.Scatter(
    x = df['desc_len'],
    y = np.log(df['price']+1),
    mode = 'lines+markers',
    name = 'lines+markers'
)
layout = dict(title= 'Average Log(Price) by Description Length',
              yaxis = dict(title='Average Log(Price)'),
              xaxis = dict(title='Description Length'))
fig=dict(data=[trace1], layout=layout)
py.iplot(fig)
In [ ]:
train.item_description.isnull().sum()
Out[ ]:
4
In [ ]:
# remove missing values in item description
train = train[pd.notnull(train['item_description'])]
In [ ]:
# create a dictionary of words for each category

tokenize = nltk.data.load('tokenizers/punkt/english.pickle')  

cat_desc = dict()
for cat in general_cats: 
    text = " ".join(train.loc[train['general_cat']==cat, 'item_description'].values)
    cat_desc[cat] = tokenize.tokenize(text)

# flat list of all words combined
flat_lst = [item for sublist in list(cat_desc.values()) for item in sublist]
allWordsCount = Counter(flat_lst)
all_top10 = allWordsCount.most_common(20)
x = [w[0] for w in all_top10]
y = [w[1] for w in all_top10]
In [ ]:
trace1 = go.Bar(x=x, y=y, text=pct)
layout = dict(title= 'Word Frequency',
              yaxis = dict(title='Count'),
              xaxis = dict(title='Word'))
fig=dict(data=[trace1], layout=layout)
py.iplot(fig)

NLP

  • Tokenization
  • Removing stop words
  • Filtering and sorting
In [ ]:
stop = set(stopwords.words('english'))
def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try: 
        regex = re.compile('[' +re.escape(string.punctuation) + '0-9\\r\\t\\n]')
        text = regex.sub(" ", text) # remove punctuation
        
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search('[a-zA-Z]', w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w)>=3]
        
        return filtered_tokens
            
    except TypeError as e: print(text,e)
In [ ]:
# apply the tokenizer into the item descriptipn column
train['tokens'] = train['item_description'].map(tokenize)
test['tokens'] = test['item_description'].map(tokenize)
In [ ]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

Print the result

In [ ]:
for description, tokens in zip(train['item_description'].head(),
                              train['tokens'].head()):
    print('description:', description)
    print('tokens:', tokens)
    print()
description: No description yet
tokens: ['description', 'yet']

description: This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.
tokens: ['keyboard', 'great', 'condition', 'works', 'like', 'came', 'box', 'ports', 'tested', 'work', 'perfectly', 'lights', 'customizable', 'via', 'razer', 'synapse', 'app']

description: Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!
tokens: ['adorable', 'top', 'hint', 'lace', 'key', 'hole', 'back', 'pale', 'pink', 'also', 'available', 'white']

description: New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage
tokens: ['new', 'tags', 'leather', 'horses', 'retail', 'stand', 'foot', 'high', 'sold', 'pair', 'questions', 'please', 'ask', 'free', 'shipping', 'got', 'storage']

description: Complete with certificate of authenticity
tokens: ['complete', 'certificate', 'authenticity']

Word cloud

In [ ]:
# build dictionary with key=category and values as all the descriptions related.
cat_desc = dict()
for cat in general_cats: 
    text = " ".join(train.loc[train['general_cat']==cat, 'item_description'].values)
    cat_desc[cat] = tokenize(text)


# find the most common words for the top 4 categories
women100 = Counter(cat_desc['Women']).most_common(100)
beauty100 = Counter(cat_desc['Beauty']).most_common(100)
kids100 = Counter(cat_desc['Kids']).most_common(100)
electronics100 = Counter(cat_desc['Electronics']).most_common(100)
In [ ]:
def generate_wordcloud(tup):
    wordcloud = WordCloud(background_color='white',
                          max_words=50, max_font_size=40,
                          random_state=42
                         ).generate(str(tup))
    return wordcloud
In [ ]:
fig,axes = plt.subplots(2, 2, figsize=(30, 15))

ax = axes[0, 0]
ax.imshow(generate_wordcloud(women100), interpolation="bilinear")
ax.axis('off')
ax.set_title("Women Top 100", fontsize=30)

ax = axes[0, 1]
ax.imshow(generate_wordcloud(beauty100))
ax.axis('off')
ax.set_title("Beauty Top 100", fontsize=30)

ax = axes[1, 0]
ax.imshow(generate_wordcloud(kids100))
ax.axis('off')
ax.set_title("Kids Top 100", fontsize=30)

ax = axes[1, 1]
ax.imshow(generate_wordcloud(electronics100))
ax.axis('off')
ax.set_title("Electronic Top 100", fontsize=30)
Out[ ]:
Text(0.5,1,'Electronic Top 100')

tf-idf

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=10,
                             max_features=180000,
                             tokenizer=tokenize,
                             ngram_range=(1, 2))
In [ ]:
all_desc = np.append(train['item_description'].values, test['item_description'].values)
vz = vectorizer.fit_transform(list(all_desc))
In [ ]:
vz.shape
Out[ ]:
(2175890, 180000)
In [ ]:
#  create a dictionary mapping the tokens to their tfidf values
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame(columns=['tfidf']).from_dict(
                    dict(tfidf), orient='index')
tfidf.columns = ['tfidf']

The following are the 10 lowest-scoring tags using tf-idf, which is not surprising as they are very common words that are unable to differentiate one description from another.

In [ ]:
tfidf.sort_values(by=['tfidf'], ascending=True).head(10)
Out[ ]:
tfidf
new 2.175653
size 2.330674
brand 2.755660
condition 2.799306
brand new 2.874418
free 2.903426
shipping 3.070592
worn 3.107882
used 3.165310
never 3.276901

The following are the 10 highest-scoring tags using tf-idf, which include many specific words. By looking at them, we can guess their respective categories:

In [ ]:
tfidf.sort_values(by=['tfidf'], ascending=False).head(10)
Out[ ]:
tfidf
postnatal 13.195054
subdrip rda 13.195054
lmt 13.195054
lbs length 13.195054
place step 13.195054
light volts 13.195054
thumb point 13.195054
wedgwood 13.195054
novelty bill 13.195054
colour brow 13.195054

Given the high dimensionality of our tf-idf matrix, we need to use Singular Value Decomposition (SVD) techniques to reduce its dimensions. In order to make our vocabulary visualizable, we can then use t-SNE to reduce the dimensions from 50 down to 2. t-SNE is particularly well-suited for reducing dimensions down to 2 or 3.

In [ ]:
trn = train.copy()
tst = test.copy()
trn['is_train'] = 1
tst['is_train'] = 0

sample_sz = 15000

combined_df = pd.concat([trn, tst])
combined_sample = combined_df.sample(n=sample_sz)
vz_sample = vectorizer.fit_transform(list(combined_sample['item_description']))
In [ ]:
from sklearn.decomposition import TruncatedSVD

n_comp=30
svd = TruncatedSVD(n_components=n_comp, random_state=42)
svd_tfidf = svd.fit_transform(vz_sample)
In [ ]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=42, n_iter=500)
In [ ]:
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 15000 samples in 0.021s...
[t-SNE] Computed neighbors for 15000 samples in 7.434s...
[t-SNE] Computed conditional probabilities for sample 1000 / 15000
[t-SNE] Computed conditional probabilities for sample 2000 / 15000
[t-SNE] Computed conditional probabilities for sample 3000 / 15000
[t-SNE] Computed conditional probabilities for sample 4000 / 15000
[t-SNE] Computed conditional probabilities for sample 5000 / 15000
[t-SNE] Computed conditional probabilities for sample 6000 / 15000
[t-SNE] Computed conditional probabilities for sample 7000 / 15000
[t-SNE] Computed conditional probabilities for sample 8000 / 15000
[t-SNE] Computed conditional probabilities for sample 9000 / 15000
[t-SNE] Computed conditional probabilities for sample 10000 / 15000
[t-SNE] Computed conditional probabilities for sample 11000 / 15000
[t-SNE] Computed conditional probabilities for sample 12000 / 15000
[t-SNE] Computed conditional probabilities for sample 13000 / 15000
[t-SNE] Computed conditional probabilities for sample 14000 / 15000
[t-SNE] Computed conditional probabilities for sample 15000 / 15000
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 89.433113
[t-SNE] Error after 500 iterations: 1.920846
In [ ]:
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600,
                       title="tf-idf clustering of the item description",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)
Loading BokehJS ...
In [ ]:
 
In [ ]:
combined_sample.reset_index(inplace=True, drop=True)
In [ ]:
tfidf_df = pd.DataFrame(tsne_tfidf, columns=['x', 'y'])
tfidf_df['description'] = combined_sample['item_description']
tfidf_df['tokens'] = combined_sample['tokens']
tfidf_df['category'] = combined_sample['general_cat']
In [ ]:
plot_tfidf.scatter(x='x', y='y', source=tfidf_df, alpha=0.7)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"description": "@description", "tokens": "@tokens", "category":"@category"}
show(plot_tfidf)

K-Means Clustering

In [ ]:
from sklearn.cluster import MiniBatchKMeans

num_clusters = 10 # need to be selected wisely
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters,
                               init='k-means++',
                               n_init=1,
                               init_size=1000, batch_size=1000, verbose=0, max_iter=1000)
In [ ]:
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)
In [ ]:
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
In [ ]:
# repeat the same steps for the sample
kmeans = kmeans_model.fit(vz_sample)
kmeans_clusters = kmeans.predict(vz_sample)
kmeans_distances = kmeans.transform(vz_sample)
# reduce dimension to 2 using tsne
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 15000 samples in 0.013s...
[t-SNE] Computed neighbors for 15000 samples in 1.513s...
[t-SNE] Computed conditional probabilities for sample 1000 / 15000
[t-SNE] Computed conditional probabilities for sample 2000 / 15000
[t-SNE] Computed conditional probabilities for sample 3000 / 15000
[t-SNE] Computed conditional probabilities for sample 4000 / 15000
[t-SNE] Computed conditional probabilities for sample 5000 / 15000
[t-SNE] Computed conditional probabilities for sample 6000 / 15000
[t-SNE] Computed conditional probabilities for sample 7000 / 15000
[t-SNE] Computed conditional probabilities for sample 8000 / 15000
[t-SNE] Computed conditional probabilities for sample 9000 / 15000
[t-SNE] Computed conditional probabilities for sample 10000 / 15000
[t-SNE] Computed conditional probabilities for sample 11000 / 15000
[t-SNE] Computed conditional probabilities for sample 12000 / 15000
[t-SNE] Computed conditional probabilities for sample 13000 / 15000
[t-SNE] Computed conditional probabilities for sample 14000 / 15000
[t-SNE] Computed conditional probabilities for sample 15000 / 15000
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 81.781433
[t-SNE] Error after 500 iterations: 1.791430
In [ ]:
#combined_sample.reset_index(drop=True, inplace=True)
kmeans_df = pd.DataFrame(tsne_kmeans, columns=['x', 'y'])
kmeans_df['cluster'] = kmeans_clusters
kmeans_df['description'] = combined_sample['item_description']
kmeans_df['category'] = combined_sample['general_cat']
#kmeans_df['cluster']=kmeans_df.cluster.astype(str).astype('category')
In [ ]:
plot_kmeans = bp.figure(plot_width=700, plot_height=600,
                        title="KMeans clustering of the description",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)
In [ ]:
kmeans_clusters
Out[ ]:
array([8, 4, 6, ..., 5, 4, 5])
In [ ]:
colormap = {'0': 'red', '1': 'green', '2': 'blue', '3': 'black', '4': 'yellow', '5': 'pink', '6': 'purple', '7': 'grey', '8': 'brown', '9': 'orange'}
def get_color(num):
    if num == 0:
        return 'red'
    elif num == 1:
        return 'green'
    elif num == 2:
        return 'blue'
    elif num == 3:
        return 'black'
    elif num == 4:
        return 'yellow'
    elif num == 5:
        return 'pink'
    elif num == 6:
        return 'purple'
    elif num == 7:
        return 'grey'
    elif num == 8:
        return 'brown'
    elif num == 9:
        return 'orange'
color = pd.Series(kmeans_clusters).apply(get_color)
In [ ]:
colormap = {'0': 'red', '1': 'green', '2': 'blue', '3': 'black', '4': 'yellow'}
source = ColumnDataSource(data=dict(x=kmeans_df['x'], y=kmeans_df['y'],
                                    color=color,
                                    description=kmeans_df['description'],
                                    category=kmeans_df['category'],
                                    cluster=kmeans_df['cluster']))

plot_kmeans.scatter(x='x', y='y', color='color', source=source)
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips={"description": "@description", "category": "@category", "cluster":"@cluster" }
show(plot_kmeans)

LDA Model

The input is bag of words

In [ ]:
cvectorizer = CountVectorizer(min_df=4,
                              max_features=180000,
                              tokenizer=tokenize,
                              ngram_range=(1,2))
In [ ]:
cvz = cvectorizer.fit_transform(combined_sample['item_description'])
In [ ]:
lda_model = LatentDirichletAllocation(n_components=10,
                                      learning_method='online',
                                      max_iter=20,
                                      random_state=42)
In [ ]:
X_topics = lda_model.fit_transform(cvz)
In [ ]:
n_top_words = 10
topic_summaries = []

topic_word = lda_model.components_  # get the topic words
vocab = cvectorizer.get_feature_names()

for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' | '.join(topic_words)))
Topic 0: green | silver | pink | blue | one | white | matte | black | purple | set
Topic 1: inches | makeup | made | dunn | care | rae | rae dunn | bag | long | packaging
Topic 2: case | iphone | plus | size | picture | quality | know | high | note | let
Topic 3: like | new | price | size | bundle | like new | firm | dress | long | price firm
Topic 4: condition | great | size | good | used | worn | free | great condition | home | good condition
Topic 5: new | brand | brand new | used | never | never used | box | color | new never | authentic
Topic 6: description | yet | description yet | gold | charger | apple | comes | bracelet | disney | included
Topic 7: please | shipping | free | bundle | items | price | item | ask | new | ship
Topic 8: size | new | tags | brand | worn | black | small | brand new | pink | medium
Topic 9: shipping | free | free shipping | great | fast | price | game | clean | scratches | includes
In [ ]:
# reduce dimension to 2 using tsne
tsne_lda = tsne_model.fit_transform(X_topics)
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 15000 samples in 0.013s...
[t-SNE] Computed neighbors for 15000 samples in 1.870s...
[t-SNE] Computed conditional probabilities for sample 1000 / 15000
[t-SNE] Computed conditional probabilities for sample 2000 / 15000
[t-SNE] Computed conditional probabilities for sample 3000 / 15000
[t-SNE] Computed conditional probabilities for sample 4000 / 15000
[t-SNE] Computed conditional probabilities for sample 5000 / 15000
[t-SNE] Computed conditional probabilities for sample 6000 / 15000
[t-SNE] Computed conditional probabilities for sample 7000 / 15000
[t-SNE] Computed conditional probabilities for sample 8000 / 15000
[t-SNE] Computed conditional probabilities for sample 9000 / 15000
[t-SNE] Computed conditional probabilities for sample 10000 / 15000
[t-SNE] Computed conditional probabilities for sample 11000 / 15000
[t-SNE] Computed conditional probabilities for sample 12000 / 15000
[t-SNE] Computed conditional probabilities for sample 13000 / 15000
[t-SNE] Computed conditional probabilities for sample 14000 / 15000
[t-SNE] Computed conditional probabilities for sample 15000 / 15000
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 86.734573
[t-SNE] Error after 500 iterations: 2.042183
In [ ]:
unnormalized = np.matrix(X_topics)
doc_topic = unnormalized/unnormalized.sum(axis=1)

lda_keys = []
for i, tweet in enumerate(combined_sample['item_description']):
    lda_keys += [doc_topic[i].argmax()]

lda_df = pd.DataFrame(tsne_lda, columns=['x','y'])
lda_df['description'] = combined_sample['item_description']
lda_df['category'] = combined_sample['general_cat']
lda_df['topic'] = lda_keys
lda_df['topic'] = lda_df['topic'].map(int)
In [ ]:
plot_lda = bp.figure(plot_width=700,
                     plot_height=600,
                     title="LDA topic visualization",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)
In [ ]:
colormap = {'0': 'red', '1': 'green', '2': 'blue', '3': 'black', '4': 'yellow', '5': 'pink', '6': 'purple', '7': 'grey', '8': 'brown', '9': 'orange'}
def get_color(num):
    if num == 0:
        return 'red'
    elif num == 1:
        return 'green'
    elif num == 2:
        return 'blue'
    elif num == 3:
        return 'black'
    elif num == 4:
        return 'yellow'
    elif num == 5:
        return 'pink'
    elif num == 6:
        return 'purple'
    elif num == 7:
        return 'grey'
    elif num == 8:
        return 'brown'
    elif num == 9:
        return 'orange'
color = pd.Series(lda_keys).apply(get_color)
In [ ]:
source = ColumnDataSource(data=dict(x=lda_df['x'], y=lda_df['y'],
                                    color=color,
                                    description=lda_df['description'],
                                    topic=lda_df['topic'],
                                    category=lda_df['category']))

plot_lda.scatter(source=source, x='x', y='y', color='color')
hover = plot_kmeans.select(dict(type=HoverTool))
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips={"description":"@description",
                "topic":"@topic", "category":"@category"}
show(plot_lda)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: